# quantmod used to download stock data
#install.packages("quantmod")
library(quantmod)
# highcharter used to create interactive charts
#install.packages("highcharter")
library(highcharter)
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
Registered S3 method overwritten by 'data.table':
method from
print.data.table
# dygraphs used to create interactive candlestick charts
#install.packages("dygraphs")
library(dygraphs)
The art of transforming data into financial decisions using financial data analysis to answer questions like:
# DJIA = (StockPrice1 + ... + StockPrice30) / Dow Divisor
# Dow Divisor translates a component's price move into index points
dow_divisor <- 0.145
# Example: KO (or any blue-chip component) rises by $1;
# the index then rises by 1 / divisor points.
DJIA <- round(x = 1 / dow_divisor, digits = 2)
DJIA
[1] 6.9
# Market Cap = Stock Price * # of Outstanding Shares
# Outstanding Shares = number of ALL shares issued by a company
# S&P is based on ADJUSTED market cap of 500 largest US companies (505 because some companies have 2 stock classes, like BRK-A and BRK-B)
# No. of Floating SHares are shares available for PUBLIC trading
# S&P Index = (Stock Price 1 * No. of Floating Shares 1 + ... + Stock Price 505 * No. of Floating Shares 505) / Index Divisor
# Download data for DJIA (^DJI) and S&P (^GSPC)
getSymbols(c("^DJI","^GSPC"), from = "2000-1-1", to = "2020-4-1")
‘getSymbols’ currently uses auto.assign=TRUE by default, but will
use auto.assign=FALSE in 0.5-0. You will still be able to use
‘loadSymbols’ to automatically load data. getOption("getSymbols.env")
and getOption("getSymbols.auto.assign") will still be checked for
alternate defaults.
This message is shown once per session and may be disabled by setting
options("getSymbols.warning4.0"=FALSE). See ?getSymbols for details.
[1] "^DJI" "^GSPC"
# Inspect DJI
head(DJI)
DJI.Open DJI.High DJI.Low DJI.Close DJI.Volume DJI.Adjusted
2000-01-03 11501.85 11522.01 11305.69 11357.51 1697500 11357.51
2000-01-04 11349.75 11350.06 10986.45 10997.93 1784200 10997.93
2000-01-05 10989.37 11215.10 10938.67 11122.65 2031900 11122.65
2000-01-06 11113.37 11313.45 11098.45 11253.26 1765500 11253.26
2000-01-07 11247.06 11528.14 11239.92 11522.56 1849000 11522.56
2000-01-10 11532.48 11638.28 11532.48 11572.20 1681800 11572.20
# Inspect GSPC
head(GSPC)
GSPC.Open GSPC.High GSPC.Low GSPC.Close GSPC.Volume GSPC.Adjusted
2000-01-03 1469.25 1478.00 1438.36 1455.22 931800000 1455.22
2000-01-04 1455.22 1455.22 1397.43 1399.42 1009000000 1399.42
2000-01-05 1399.42 1413.27 1377.68 1402.11 1085500000 1402.11
2000-01-06 1402.11 1411.90 1392.10 1403.45 1092300000 1403.45
2000-01-07 1403.45 1441.47 1400.73 1441.47 1225200000 1441.47
2000-01-10 1441.47 1464.36 1441.47 1457.60 1064800000 1457.60
# Candlestick chart of DJI for January 2020, drawn on the white theme
chartSeries(DJI, type = "candlesticks", subset = "2020-1", theme = "white")

# Bar chart of DJI for Jan-Mar 2020.
# On each bar, the tick on the left marks the open and the tick on the
# right marks the close.
chartSeries(DJI, type = "bars", subset = "2020-1::2020-3")

# Interactive dygraphs candlestick over the OHLC columns only
# (columns 1:4 = Open, High, Low, Close), wrapped with a range selector
dji_ohlc <- DJI[, 1:4]
dyRangeSelector(dyCandlestick(dygraph(dji_ohlc)))

# Interactive candlestick chart with highcharter
hchart(DJI)
# "/Users/timhulak/Desktop/Syracuse/FIN-654\ Financial\ Analytics/Week\ 3/va.xlsx"
va_file = "/Users/timhulak/Desktop/Syracuse/FIN-654\ Financial\ Analytics/Week\ 3/va.csv"
# Download Virgin America data from an Excel/CSV File
VA <- read.csv(file = va_file)
head(VA)
tail(VA)
# Check the type of data. It is a DataFrame, not a time series object
str(VA)
'data.frame': 524 obs. of 6 variables:
$ Day : chr "17-Nov-14" "18-Nov-14" "19-Nov-14" "20-Nov-14" ...
$ Open : num 30.3 33.7 38.5 32.8 35.8 ...
$ High : num 32.9 39.6 39.5 35 36.1 ...
$ Low : num 30 33.5 32.9 32.4 33.9 ...
$ Volume: int 5640000 12830000 9080000 3120000 1870000 2230000 2060000 921480 2370000 2730000 ...
$ Close : num 32.7 37 33.2 34 34.5 ...
# using the lookup table PDF, match the Day column with the encoding of the formatting to reformat the Day column
VA$Day <- as.Date(VA$Day, format = "%d-%b-%y")
tail(VA)
# Convert the VA data frame into an xts (extensible time series) object.
# xts(core data = all non-time columns, order.by = the Date column);
# the xts package is loaded as a dependency of quantmod.
Branson <- xts(x = VA[, -1], order.by = VA[["Day"]])
str(Branson)
An ‘xts’ object on 2014-11-17/2016-12-13 containing:
Data: num [1:524, 1:5] 30.3 33.7 38.5 32.8 35.8 ...
- attr(*, "dimnames")=List of 2
..$ : NULL
..$ : chr [1:5] "Open" "High" "Low" "Volume" ...
Indexed by objects of class: [Date] TZ: UTC
xts Attributes:
NULL
tail(Branson)
Open High Low Volume Close
2016-12-06 56.80 56.95 56.70 7130000 56.80
2016-12-07 56.75 57.00 56.75 3020000 56.95
2016-12-08 57.00 57.05 56.95 2110000 57.00
2016-12-09 56.95 57.00 56.95 605760 57.00
2016-12-12 56.95 57.05 56.95 1320000 56.98
2016-12-13 56.95 57.05 56.95 417690 56.98
chartSeries(Branson, type = "candlesticks", subset = "2015-8")
# https://www.treasury.gov/resource-center/data-chart-center/interest-rates/Pages/TextView.aspx?data=yield
# A 52-week Treasury bill quoted at a 2.28% discount rate on $100 face value.
face_value <- 100
discount_rate <- 2.28 / 100
discount <- face_value * discount_rate
maturity <- 52                         # weeks to maturity (context only; unused below)
# The bill is bought at face value minus the discount
purchase_price <- face_value - discount
# Total Return = (Ending Value - Beginning Value) / Beginning Value
total_return <- (face_value - purchase_price) / purchase_price
round(100 * total_return, 2)
[1] 2.33
# Convert an annualized return into an approximate monthly return.
# The quoted 2.42% annual rate is 4x the 3-month return: 2.42 / 4 = 0.605%.
# Ending Value = Beginning Value * (1 + Total Return)
# Approximate solution: spread the annual rate evenly across 12 months.
annualized_return <- 0.0242
months_in_a_year <- 12
round(annualized_return / months_in_a_year, 6) * 100
[1] 0.2017
# Exact solution via compound growth.
# NOTE: 0.00605 is the 3-MONTH (quarterly) return (2.42% / 4 — see above),
# not a monthly return; the original variable name was misleading.
# Taking the cube root of the quarterly growth factor gives the
# equivalent monthly return: (1 + quarterly)^(1/3) - 1
quarterly_return <- 0.00605
round((1 + quarterly_return)^(1 / 3) - 1, 6) * 100
[1] 0.2013
# Federal Reserve Bank of St. Louis. The symbol for FRED is DGS3MO
# R Will download all available data (no to and from)
# This is the annualized return for 3 month treasury bills
getSymbols("DGS3MO", src = "FRED")
[1] "DGS3MO"
head(DGS3MO)
DGS3MO
1981-09-01 17.01
1981-09-02 16.65
1981-09-03 16.96
1981-09-04 16.64
1981-09-07 NA
1981-09-08 16.54
# Convert the annualized percentage rates into approximate monthly returns:
# divide by 100 (percent -> decimal), then by 12 (annual -> monthly).
# Doing both at once means dividing by 100 * 12 = 1200.
# BUG FIX: the original assigned risk.free three times in a row; the first
# two (DGS3MO / 100 and DGS3MO / 12) were dead stores immediately
# overwritten, and DGS3MO / 12 on its own omits the /100 step. Only the
# combined division is needed.
risk.free <- DGS3MO / 1200
head(risk.free)
DGS3MO
1981-09-01 0.01417500
1981-09-02 0.01387500
1981-09-03 0.01413333
1981-09-04 0.01386667
1981-09-07 NA
1981-09-08 0.01378333
“RISK-FREE” TREASURY BILLS WITH GUARANTEED POSITIVE RETURN
RISKY APPLE STOCKS WITH HIGHER EXPECTED RETURN PREDICTED ; NOT GUARANTEED !
IS APPLE STOCKS’ EXPECTED RETURN HIGH ENOUGH TO JUSTIFY RISK ?
You will decide on if the expected return on Apple, Inc. stocks is high enough to justify risk. Use Capital Asset Pricing Model and predict the “fair” return on Apple, Inc. stocks. To do so, use the monthly return data from May 1, 2015 through April 30, 2020.
Use R and download the index data for S&P500 and price data for Apple, Inc. stocks. Download the data between the first trading day of May 2015 and the last trading day of Apr 2020.
library(quantmod)
library(PerformanceAnalytics)
# S&P 500 Index and Apple Inc.
getSymbols(c("^GSPC","AAPL"), from = "2015-5-1", to = "2020-5-1")
[1] "^GSPC" "AAPL"
head(GSPC)
GSPC.Open GSPC.High GSPC.Low GSPC.Close GSPC.Volume GSPC.Adjusted
2015-05-01 2087.38 2108.41 2087.38 2108.29 3379390000 2108.29
2015-05-04 2110.23 2120.95 2110.23 2114.49 3091580000 2114.49
2015-05-05 2112.63 2115.24 2088.46 2089.46 3793950000 2089.46
2015-05-06 2091.26 2098.42 2067.93 2080.15 3792210000 2080.15
2015-05-07 2079.96 2092.90 2074.99 2088.00 3676640000 2088.00
2015-05-08 2092.13 2117.66 2092.13 2116.10 3399440000 2116.10
head(AAPL)
AAPL.Open AAPL.High AAPL.Low AAPL.Close AAPL.Volume AAPL.Adjusted
2015-05-01 31.5250 32.5325 31.3250 32.2375 234050400 29.38875
2015-05-04 32.3750 32.6425 32.0650 32.1750 203953200 29.33177
2015-05-05 32.0375 32.1125 31.4450 31.4500 197085600 28.67084
2015-05-06 31.6400 31.6875 30.8400 31.2525 288564000 28.49079
2015-05-07 31.1925 31.5200 31.0050 31.3150 175763600 28.66702
2015-05-08 31.6700 31.9050 31.5275 31.9050 222201600 29.20712
Compute the monthly returns on AAPL and the S&P 500 Index.
apple.monthly <- monthlyReturn(AAPL$AAPL.Adjusted)
market.monthly <- monthlyReturn(GSPC$GSPC.Adjusted)
head(apple.monthly)
monthly.returns
2015-05-29 0.01453417
2015-06-30 -0.03722746
2015-07-31 -0.03292672
2015-08-31 -0.06619624
2015-09-30 -0.02181624
2015-10-30 0.08340903
head(market.monthly)
monthly.returns
2015-05-29 -0.0004269555
2015-06-30 -0.0210116724
2015-07-31 0.0197420297
2015-08-31 -0.0625808182
2015-09-30 -0.0264428316
2015-10-30 0.0829831178
Download the annualized return on 3-month Treasury Bills from the Federal Reserve Bank of Saint Louis. Then convert the annualized returns into approximate monthly returns.
getSymbols("DGS3MO", src="FRED")
[1] "DGS3MO"
head( DGS3MO )
DGS3MO
1981-09-01 17.01
1981-09-02 16.65
1981-09-03 16.96
1981-09-04 16.64
1981-09-07 NA
1981-09-08 16.54
To convert the annualized returns into approximate monthly returns: First, divide them by 100. e.g. 12.17% = 0.1217 and not 12.17
Then divide them by 12 to re-scale them to approximately one month
So divide each value by 100*12 = 1200
no.risk <- DGS3MO / 1200
head(no.risk)
DGS3MO
1981-09-01 0.01417500
1981-09-02 0.01387500
1981-09-03 0.01413333
1981-09-04 0.01386667
1981-09-07 NA
1981-09-08 0.01378333
Compute the excess returns on AAPL and the S&P 500 Index in excess of 3-month Treasury Bills.
Both apple.monthly & no.risk are xts objects in R. They are self-aware of when they exist in time. Monthly excess return: “Reward” for taking risk
For xts objects, R computes apple.monthly − no.risk only if there is data for both objects on a given date.
EXCESS RETURN = PAST RETURN – RISK-FREE RETURN
apple.excess.monthly <- apple.monthly - no.risk
market.excess.monthly <- market.monthly - no.risk
head(apple.excess.monthly)
monthly.returns
2015-05-29 0.01452583
2015-06-30 -0.03723579
2015-07-31 -0.03299339
2015-08-31 -0.06626291
2015-09-30 -0.02181624
2015-10-30 0.08334236
head(market.excess.monthly)
monthly.returns
2015-05-29 -0.0004352888
2015-06-30 -0.0210200057
2015-07-31 0.0196753630
2015-08-31 -0.0626474848
2015-09-30 -0.0264428316
2015-10-30 0.0829164511
Predicting the beta for the Risky Asset
Plot Apple’s monthly excess returns versus the “U.S. market’s” monthly excess returns. How volatile is Apple stocks’ performance compared to the overall U.S. stock market?
Apple’s excess return -> Y (dependent variable) “Market” excess return -> X (explanatory variable)
# Scatter of Apple's monthly excess returns (Y) vs the market's (X).
# fit = FALSE suppresses the automatic fit line; spelled out as FALSE
# rather than F, since T/F are ordinary variables that can be reassigned.
chart.Regression(apple.excess.monthly, market.excess.monthly, fit = FALSE)
# Fit the CAPM regression: the slope on the market's excess return is
# Apple's predicted beta
cider <- lm(apple.excess.monthly ~ market.excess.monthly)
# Redraw the scatter and overlay the fitted regression line
chart.Regression(apple.excess.monthly, market.excess.monthly, fit = FALSE)
abline(cider)
# View the equation
# y = Intercept + Coef * Value
# apple.excess.monthly = 0.011 + 1.165 * market.excess.monthly
# If market.excess.monthly increased by 1%, apple.excess.monthly increases by 1.165 * 1%
# 1.165 is the Predicted Beta
cider
Call:
lm(formula = apple.excess.monthly ~ market.excess.monthly)
Coefficients:
(Intercept) market.excess.monthly
0.01119 1.16453
This means that we predict that Apple stock is 1.165 times more volatile than the overall market
# Get the Data
getSymbols(c("JBLU","^GSPC"), from = "2015-5-1", to = "2020-5-1" )
[1] "JBLU" "^GSPC"
getSymbols("DGS3MO", src="FRED")
[1] "DGS3MO"
# Shape data: convert annualized percent rates into approximate monthly
# decimal returns (divide by 100 * 12 = 1200)
no.risk <- DGS3MO / 1200
# Monthly returns from the adjusted-close column (column 6)
mint.monthly <- monthlyReturn(JBLU[, 6])
market.monthly <- monthlyReturn(GSPC[, 6])
# Excess returns: monthly return minus the risk-free rate
mint.excess.monthly <- mint.monthly - no.risk
market.excess.monthly <- market.monthly - no.risk
# CAPM regression: the slope on the market's excess return is JBLU's beta
plane <- lm(mint.excess.monthly ~ market.excess.monthly)
# Plot the scatter and overlay the fitted regression line.
# fit = FALSE spelled out rather than F (T/F are reassignable variables).
chart.Regression(mint.excess.monthly, market.excess.monthly, fit = FALSE)
abline(plane)
plane
Call:
lm(formula = mint.excess.monthly ~ market.excess.monthly)
Coefficients:
(Intercept) market.excess.monthly
-0.01562 1.36259
# y = -0.01562 + 1.36259x
Based on the above predicted beta of approximately 1.36, it is predicted that JBLU stock is approximately 1.36 times more volatile compared to the overall market.
Z = (x - mean ) / sd
Z = (22 - 16) / 3 = 2
# Z-score: how many standard deviations a value lies from the mean.
# Z = (x - mean) / sd
mu <- 6.0               # mean lifespan (years)
sigma <- 2.5            # standard deviation (years)
blu_ray_lifespan <- 3.0 # observed value to standardize
Z <- (blu_ray_lifespan - mu) / sigma
Z
[1] -1.2
Read in the data from the “FOOD.rdata” file
load("/Users/timhulak/Desktop/Syracuse/FIN-654\ Financial\ Analytics/Week\ 7/7_food.rdata")
head(FOOD)
library(caret)
Loading required package: lattice
Loading required package: ggplot2
In this problem, CLASS is the dependent variable and GI and CALORIES are the independent variables
Train the model
FEAST <- train(CLASS ~ GI + CALORIES, data = FOOD, method = "knn")
FEAST
k-Nearest Neighbors
14 samples
2 predictor
3 classes: 'fruit', 'nut', 'veggie'
No pre-processing
Resampling: Bootstrapped (25 reps)
Summary of sample sizes: 14, 14, 14, 14, 14, 14, ...
Resampling results across tuning parameters:
k Accuracy Kappa
5 0.6124762 0.4674249
7 0.5352381 0.3731238
9 0.3572381 0.1616762
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was k = 5.
A random sample of 14 raw food items. Let’s classify these raw food items: NUTS, VEGETABLES, FRUIT
Predict which class a coconut is
COCONUT <- data.frame(GI=53, CALORIES=369)
COCONUT
# predict ( model, dataframe )
predict(FEAST, COCONUT)
[1] nut
Levels: fruit nut veggie
KNN is a distance-based model. All predictors should be of similar same scale to be equally influential. Therefore, we must preprocess the data (transform the data). In this case, center and scale. Center subtracts the mean of the column from each value. Scale divides the result by the standard deviation of the result. In other words, the Z score. Regardless of the range of the original data, Z Scores typically range from ≈−3 to +3. All predictors have similar scales now.
FEAST <- train(CLASS ~ GI + CALORIES, data = FOOD, method = "knn", preProcess = c("center", "scale"))
There were missing values in resampled performance measures.
FEAST
k-Nearest Neighbors
14 samples
2 predictor
3 classes: 'fruit', 'nut', 'veggie'
Pre-processing: centered (2), scaled (2)
Resampling: Bootstrapped (25 reps)
Summary of sample sizes: 14, 14, 14, 14, 14, 14, ...
Resampling results across tuning parameters:
k Accuracy Kappa
5 0.7071429 0.5643090
7 0.5447619 0.3793763
9 0.3963810 0.2438282
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was k = 5.
This time, the predicted class of coconut is a Fruit
# predict ( model, dataframe )
predict(FEAST, COCONUT)
[1] fruit
Levels: fruit nut veggie
Predict the CLASS of jalapeno based on the KNN model FEAST.
JALAPENO <- data.frame(GI=30, CALORIES=29)
predict(FEAST, JALAPENO)
[1] veggie
Levels: fruit nut veggie
Predict the CLASS of macadamia based on the KNN model FEAST.
macadamia <- data.frame(GI=10, CALORIES=705)
predict(FEAST, macadamia)
[1] nut
Levels: fruit nut veggie
Predict the CLASS of mango based on the KNN model FEAST.
mango <- data.frame(GI=51, CALORIES=60)
predict(FEAST, mango)
[1] fruit
Levels: fruit nut veggie
Load the data for Euro bank notes (2 data sets)
load("/Users/timhulak/Desktop/Syracuse/FIN-654\ Financial\ Analytics/Week\ 7/7_euro.rdata")
# Training data
head(EURO1)
# Testing Data
head(EURO2)
NA
Use the data in EURO1 and train a KNN model that predicts the CLASS of a Euro banknote
Train a model (set a tune length to tell it how many K values to try. By default, the function tries odd k values starting with k=5. Even k’s are not used to avoid ties.)
FORGER <- train(CLASS ~ . , data = EURO1, method = "knn", preProcess = c("center", "scale"), tuneLength = 20)
FORGER
k-Nearest Neighbors
1000 samples
4 predictor
2 classes: 'forged', 'genuine'
Pre-processing: centered (4), scaled (4)
Resampling: Bootstrapped (25 reps)
Summary of sample sizes: 1000, 1000, 1000, 1000, 1000, 1000, ...
Resampling results across tuning parameters:
k Accuracy Kappa
5 0.9987025 0.9973795
7 0.9972891 0.9945014
9 0.9961957 0.9923088
11 0.9960811 0.9920800
13 0.9958809 0.9916858
15 0.9953377 0.9905854
17 0.9944424 0.9887717
19 0.9937646 0.9874025
21 0.9922616 0.9843830
23 0.9910686 0.9819859
25 0.9906800 0.9811901
27 0.9907651 0.9813675
29 0.9895615 0.9789385
31 0.9894389 0.9787109
33 0.9906523 0.9811515
35 0.9908642 0.9815748
37 0.9905401 0.9809231
39 0.9908690 0.9815772
41 0.9897775 0.9793803
43 0.9901075 0.9800405
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was k = 5.
GUESS <- predict(FORGER, EURO2)
tail(GUESS)
[1] genuine forged genuine forged genuine genuine
Levels: forged genuine
Build a confusion matrix for the results
confusionMatrix(GUESS, EURO2$CLASS)
Confusion Matrix and Statistics
Reference
Prediction forged genuine
forged 160 1
genuine 0 211
Accuracy : 0.9973
95% CI : (0.9851, 0.9999)
No Information Rate : 0.5699
P-Value [Acc > NIR] : <2e-16
Kappa : 0.9945
Mcnemar's Test P-Value : 1
Sensitivity : 1.0000
Specificity : 0.9953
Pos Pred Value : 0.9938
Neg Pred Value : 1.0000
Prevalence : 0.4301
Detection Rate : 0.4301
Detection Prevalence : 0.4328
Balanced Accuracy : 0.9976
'Positive' Class : forged
Use the model and predict the CLASS of a banknote with the following characteristics: + VARIANCE=−2.97 + SKEWNESS=−10.33 + CURTOSIS=8.78 + ENTROPY=−2.11
MONEY <- data.frame(VARIANCE = -2.97, SKEWNESS = -10.33, CURTOSIS = 8.78, ENTROPY = -2.11)
predict(FORGER, MONEY)
[1] forged
Levels: forged genuine
Logarithm (log): log2(8) = 3 — which power of 2 gives us 8? 3
Euler’s number (e): an irrational number like pi. 2.7182818
log_e(x) = ln(x), the natural log function of x
The Exponential function is the inverse of natural log
Probability Vs. Odds
Probability: + P(A) = n(A) / n(S) + What proportion of the time an outcome is to occur + The probability of rolling a 2 on a 6 sided die = 1/6 (0.167) + Maximum value = 100% + Minimum value = 0%
Odds: P / (1 - P) + Probability it will happen divided by the probability it will not happen
+ The odds that you will score a 2 = 0.167 / (1 - 0.167) = 0.2 [1/6 / (1 - 1/6) = 1/5] + Maximum value = Infinity + Minimum value = 0%
# Load Library
library(caret)
# Load the data
load("/Users/timhulak/Desktop/Syracuse/FIN-654\ Financial\ Analytics/Week\ 8/8_titanic.rdata")
head(Titanic)
Task: Build Bayesian Binary Logistic Regression that predicts the probability of surviving
ICEBERG <- train(SURVIVED ~ ., data = Titanic, method = "bayesglm")
summary(ICEBERG)
Call:
NULL
Deviance Residuals:
Min 1Q Median 3Q Max
-2.6177 -0.7001 -0.4382 0.6708 2.3833
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 3.460249 0.321704 10.756 < 2e-16 ***
CLASSsecond -1.245829 0.222639 -5.596 2.20e-08 ***
CLASSthird -2.249268 0.222658 -10.102 < 2e-16 ***
GENDERmale -2.481615 0.164887 -15.050 < 2e-16 ***
AGE -0.033539 0.006263 -5.355 8.55e-08 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1414.62 on 1045 degrees of freedom
Residual deviance: 982.66 on 1041 degrees of freedom
AIC: 992.66
Number of Fisher Scoring iterations: 6
Consider an 18 year old male passenger in 3rd class
JACK <- data.frame(CLASS='third', GENDER='male', AGE=18)
predict(ICEBERG, JACK, type = "prob")
NA
Consider an 18 year old female passenger in 1st class
ROSE <- data.frame(CLASS='first', GENDER='female', AGE=18)
predict(ICEBERG, ROSE, type = "prob")
KNN: requires + Requires pre processing of predictor variables + Need to set.seed() for reproducible results + Can apply to problems with more that two classes Bayesian Binary Logistic Regression + Do not need to preprocess predictor variables + Do not need to set.seed() + Is not applicable to problems with more with more than two classes
# Load Library
library(caret)
load("/Users/timhulak/Desktop/Syracuse/FIN-654\ Financial\ Analytics/Week\ 8/8_home_loan.rdata")
head(HOME1)
head(HOME2)
# Set the seed for reproducible results. This will allow the bootstrapping of the KNN algorithm to use the same random numbers each time
set.seed(13)
HGTV <- train(DEFAULT ~ ., data = HOME1, method = "knn", preProcess = c("center", "scale"), tuneLength = 20)
HGTV
k-Nearest Neighbors
1000 samples
5 predictor
2 classes: 'no', 'yes'
Pre-processing: centered (5), scaled (5)
Resampling: Bootstrapped (25 reps)
Summary of sample sizes: 1000, 1000, 1000, 1000, 1000, 1000, ...
Resampling results across tuning parameters:
k Accuracy Kappa
5 0.5060711 0.009607512
7 0.5071309 0.010885838
9 0.5098630 0.015490393
11 0.5174016 0.030451607
13 0.5191932 0.033055891
15 0.5242841 0.043797767
17 0.5275205 0.050092521
19 0.5305308 0.056826019
21 0.5315578 0.058264064
23 0.5330213 0.061659561
25 0.5337244 0.062403006
27 0.5394562 0.073723921
29 0.5417160 0.078062340
31 0.5445272 0.083097377
33 0.5474947 0.089345388
35 0.5467860 0.088104788
37 0.5448651 0.084269635
39 0.5458030 0.086530900
41 0.5451493 0.085278917
43 0.5465110 0.087969099
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was k = 33.
# Fit a Bayesian binary logistic regression on the HOME1 training data
DIY <- train(DEFAULT ~.,data=HOME1,method="bayesglm")
# Define a potential customer as a one-row data frame matching HOME1's predictors
JOHNDOE <- data.frame(TYPE="cash", GENDER="male",CAR="yes",CHILDREN=2,RATIO=2.40)
# Will he default on his home loan? (class probabilities from the logistic model)
predict(DIY,JOHNDOE, type = "prob")
# KNN class probabilities: 17 of the 33 (51.52%) nearest neighbors
# defaulted, whereas 16 (48.48%) did not
predict(HGTV,JOHNDOE, type = "prob")
Test the models
tail(HOME2)
# The KNN model is 55.75% accurate
# Accuracy : 0.56
hgtv <- predict(HGTV, HOME2)
# Build confusion matrix to check
confusionMatrix(hgtv, HOME2$DEFAULT)
Confusion Matrix and Statistics
Reference
Prediction no yes
no 58 98
yes 79 165
Accuracy : 0.5575
95% CI : (0.5073, 0.6068)
No Information Rate : 0.6575
P-Value [Acc > NIR] : 1.0000
Kappa : 0.0491
Mcnemar's Test P-Value : 0.1761
Sensitivity : 0.4234
Specificity : 0.6274
Pos Pred Value : 0.3718
Neg Pred Value : 0.6762
Prevalence : 0.3425
Detection Rate : 0.1450
Detection Prevalence : 0.3900
Balanced Accuracy : 0.5254
'Positive' Class : no
# Logistic Regression is 60.25% accurate
# Accuracy : 0.6025
diy <- predict(DIY, HOME2)
confusionMatrix(diy, HOME2$DEFAULT)
Confusion Matrix and Statistics
Reference
Prediction no yes
no 35 57
yes 102 206
Accuracy : 0.6025
95% CI : (0.5527, 0.6508)
No Information Rate : 0.6575
P-Value [Acc > NIR] : 0.9905664
Kappa : 0.0421
Mcnemar's Test P-Value : 0.0004841
Sensitivity : 0.2555
Specificity : 0.7833
Pos Pred Value : 0.3804
Neg Pred Value : 0.6688
Prevalence : 0.3425
Detection Rate : 0.0875
Detection Prevalence : 0.2300
Balanced Accuracy : 0.5194
'Positive' Class : no
The “No Information Rate” is the most common class (in this case, 65.75% are default = yes)
Therefore, since both models did not perform better than the “No Information Rate”, neither model is a good one. In order to raise the accuracy, you will need to collect more data or add more predictor variables.
Quantitative predictions such as “how much revenue will a movie generate” or “what would be a fair selling price for a property”
The Hyperbolic Tangent ranges from -1 to 1.
If X >= 3, then TanH is approximately equal to 1. If X <= -3, then TanH is approximately equal to -1
# Load the Data
load("/Users/timhulak/Desktop/Syracuse/FIN-654\ Financial\ Analytics/Week\ 9/9_ames.rdata")
tail(ames)
Predict the fair selling price of a property.
*The data include: + Price: Selling price divided by $1,000,000 + Rooms: Number of rooms divided by 10 + Baths: Number of bathrooms divided by 10 + Kitchens: Number of kitchens divided by 10 + Cars: car capacity of garage divided by 10
When building artificial neural networks, the data should have “small” values (e.g. between -3 and 3) to satisfy the Hyperbolic Tangent. The “smaller”, the better.
#install.packages("neuralnet")
library(neuralnet)
LOGCABIN <- neuralnet(PRICE ~ ., data = ames, hidden = 1, act.fct = "tanh")
plot(LOGCABIN)
Each time you build the model, you will get different weights. The algorithm starts with random coefficients and converges to a nearby local optimum solution. The local optimum solution may differ because it uses different random numbers. You can set.seed() to reproduce results (the seed number does not matter) and tell the algorithm to use the same random numbers each time it is run.
set.seed(1)
LOGCABIN <- neuralnet(PRICE ~ ., data = ames, hidden = 1, act.fct = "tanh")
plot(LOGCABIN)
Anatomy: INPUT nodes (predictors), HIDDEN node(s), OUTPUT node (prediction). There are also “BIAS” terms (the blue lines). The numbers are the “weights”.
Use the model to predict the price of a house that has 7 rooms, 3 bathrooms, 1 kitchen, and a garage for 2 cars.
In the case of the above, the value of each predictor is multiplied by its weight. The BIAS term (the 1 in the node) is likewise multiplied by its weight (in this case, 1 * 0.04719). See Below:
The sum of the above, 0.16015, is the value that flows into the HIDDEN NODE. The hidden node includes the activation function (in this case, the Hyperbolic Tangent).
1 * 0.04719
[1] 0.04719
0.7 * 0.06263
[1] 0.043841
0.3 * 0.16368
[1] 0.049104
0.1 * -0.28729
[1] -0.028729
0.2 * 0.24372
[1] 0.048744
The hidden node is going to apply the Hyperbolic Tangent function to the value of 0.16015.
# Weighted inflow to the hidden node: bias plus each input times its weight
node_inputs  <- c(1, 0.7, 0.3, 0.1, 0.2)  # bias, rooms, baths, kitchens, cars
node_weights <- c(0.04719, 0.06263, 0.16368, -0.28729, 0.24372)
FLOW <- sum(node_inputs * node_weights)
FLOW
[1] 0.16015
tanh(FLOW)
[1] 0.1587947
The outflow from the HIDDEN NODE has its own weight, so the result of the Hyperbolic Tangent function is multiplied by that weight. Then the 2nd BIAS TERM, 1 * -0.05886, is added to the result. This is because both of those terms flow into the OUTPUT NODE. By default, there is no activation function in the OUTPUT NODE.
(0.1587947 * 1.69864) + (1 * -0.05886)
[1] 0.210875
Finally, convert the result to dollars by multiplying it by 1,000,000 (which is what we divided by in order to make the input values “small”)
# Output node: hidden activation times its outbound weight, plus the
# output-layer bias (1 * -0.05886)
hidden_activation <- 0.1587947
OUTPUT <- (hidden_activation * 1.69864) + (1 * -0.05886)
# Rescale from millions back to dollars (prices were divided by 1,000,000)
HOME_PRICE <- OUTPUT * 1000000
HOME_PRICE
[1] 210875
Using the Algorithm
Use the model to predict the price of a house that has 7 rooms, 3 bathrooms, 1 kitchen, and a garage for 2 cars.
NEST <- data.frame(ROOMS=7/10, BATHS=3/10, KITCHENS=1/10, CARS=2/10)
NEST
predict(LOGCABIN, NEST)
[,1]
[1,] 0.2108809
predict(LOGCABIN, NEST) * 1000000
[,1]
[1,] 210880.9
Use the model and predict the “fair” selling price of a house in Ames, Iowa, that has nine rooms, four bathrooms, one kitchen, and a garage for two cars.
“manual”
plot(LOGCABIN)
# Manual forward pass for a house with nine rooms, four baths, one
# kitchen, and a two-car garage (inputs are scaled by 1/10, as in training)
BIAS1 <- 1 * 0.04719
BIAS2 <- 1 * -0.05886
scaled_inputs <- c(9, 4, 1, 2) / 10
input_weights <- c(0.06263, 0.16368, -0.28729, 0.24372)
weighted <- scaled_inputs * input_weights
rooms <- weighted[1]
baths <- weighted[2]
kitchens <- weighted[3]
cars <- weighted[4]
# Flow into the hidden node: bias plus the weighted inputs
HIDDEN <- BIAS1 + rooms + baths + kitchens + cars
# Hidden node applies tanh; its outflow is weighted, then the output bias is added
output <- (tanh(HIDDEN) * 1.69864) + BIAS2
# Rescale to dollars (prices were divided by 1,000,000)
output * 1000000
[1] 258486.3
“automatic”
# House to price: nine rooms, four bathrooms, one kitchen, TWO-car garage.
# BUG FIX: CARS was 3/10, but the stated task and the manual calculation
# above both use a two-car garage (2/10). That mismatch is why the
# "automatic" prediction (298259.1) disagreed with the manual one (258486.3).
NEW_HOME <- data.frame(ROOMS = 9 / 10, BATHS = 4 / 10, KITCHENS = 1 / 10, CARS = 2 / 10)
predict(LOGCABIN, NEW_HOME) * 1000000
[,1]
[1,] 298259.1
# Logistic function
plogis(2)
[1] 0.8807971
When the logistic functions is >= 6, it is asymptotic and virtually equal to 1. When the logistic functions is <= -6, it is asymptotic and virtually equal to 0.
library(neuralnet)
load("/Users/timhulak/Desktop/Syracuse/FIN-654\ Financial\ Analytics/Week\ 9/9_hollywood.rdata")
tail(HOLLYWOOD)
Suppose you are a movie producer and you want to predict how much revenue a movie will generate. For this purpose, you collect data from a random sample of 1,300 movies from the movie database (www.themoviedb.org). For each movie, the data set includes the following variables:
Use R & build an artificial neural network that predicts the REVENUE generated from a movie. Include one hidden node in the artificial neural network. Use logistic (sigmoid) as the activation function.
# Set the seed
set.seed(1)
# Create the model with the logistic function
POPCORN <- neuralnet(REVENUE ~ ., data = HOLLYWOOD, hidden = 1, act.fct = "logistic")
# Plot the model
plot(POPCORN)
tail(HOLLYWOOD)
THE_GREY = data.frame(BUDGET=25000000/1000000000, RUNTIME=117/60, HORROR=0, R.RATED=1)
predict(POPCORN, THE_GREY) * 1000000000
[,1]
[1,] 53262056
HALLOWEEN_ENDS <- data.frame(BUDGET=15000000/1000000000, RUNTIME=105/60, HORROR=1, R.RATED=1)
predict(POPCORN, HALLOWEEN_ENDS) * 1000000000
[,1]
[1,] 38415556
Dependent Variable on the Y axis and independent variable on the X axis (Income depends on age)
2016 Annual income for celebrities on millions of dollars
library(scatterD3)
load("/Users/timhulak/Desktop/Syracuse/FIN-654\ Financial\ Analytics/Week\ 10/correlation.rdata")
head(CELEB)
plot(x = CELEB$AGE, y = CELEB$USD, xlab = "Age (2016)", ylab = "Income ($Million)")
There is a pattern, as AGE increases INCOME tends to go down.
scatterD3(x = CELEB$AGE, y = CELEB$USD, xlab = "Age (2016)", ylab = "Income ($Million)", color = "green", hover_size = 4)
Compute the correlation coefficient
# Correlation between AGE and USD income.
# BUG FIX: the original wrote CELEB$US, which only resolved to the USD
# column via R's partial name matching with `$` -- fragile if another
# column starting with "US" is ever added. Spell the column name out.
cor(CELEB$AGE, CELEB$USD)
[1] -0.971771
# Age and EURO (same correlation as USD: EURO is a linear rescaling of USD)
cor(CELEB$AGE, CELEB$EURO)
[1] -0.971771
The correlation coefficient is negative, which demonstrates a negative linear relationship between AGE And INCOME.(Pearson’s Correlation Coefficient -1 <= X <= 1)
Suppose you are a real estate agent at Nashville, TN. You want to predict the “fair” selling price of a property. For this purpose, you collect data from a random sample of 1,200 properties at Nashville, TN.
For each property, the data set includes the following information:
Use R & build an artificial neural network that predicts the PRICE of a property. Include four hidden nodes in the artificial neural network. Use logistic (sigmoid) as the activation function.
load("/Users/timhulak/Desktop/Syracuse/FIN-654\ Financial\ Analytics/Week\ 10/10_nashville.rdata")
head(NASH1)
head(NASH2)
Use R & build an artificial neural network that predicts the PRICE of a property. Include four hidden nodes in the artificial neural network. Use logistic (sigmoid) as the activation function.
library(neuralnet)
set.seed(99)
MANCAVE <- neuralnet(PRICE ~ ., data = NASH1, hidden = 4, act.fct = "logistic")
plot(MANCAVE)
Multiple linear regression model
SHESHED <- lm(PRICE ~ ., data = NASH1)
SHESHED
Call:
lm(formula = PRICE ~ ., data = NASH1)
Coefficients:
(Intercept) ACRES AREA YEAR BATHS BRICK
1.931130 -0.027295 0.093312 -0.989820 0.303323 0.008696
BASEMENT
0.009422
tail(NASH2)
mancave <- predict(MANCAVE, NASH2)
tail(mancave)
[,1]
811 N 6th St 0.1996185
1000 A Delmas Ave 0.1924331
2714 Rosedale Pl 0.2562977
237 Haverford Ave 0.1310566
352 Dade Dr 0.1986055
1118 Clay St 0.2479515
Test the Artificial Neural Network models performance with the correlation coefficient
cor(mancave, NASH2$PRICE)
[,1]
[1,] 0.4287992
sheshed <- predict(SHESHED, NASH2)
tail(sheshed)
811 N 6th St 1000 A Delmas Ave 2714 Rosedale Pl 237 Haverford Ave
0.2319040 0.2108066 0.2536210 0.1200185
352 Dade Dr 1118 Clay St
0.1832769 0.2332810
Test the multiple linear regression models performance with the correlation coefficient
cor(sheshed, NASH2$PRICE)
[1] 0.3887337
Since the Artificial Neural Network model performed better, lets use it to make a prediction
# The following house sold for $165,000. Let's see how close the models get
FORSALE <- data.frame(ACRES = 0.019, AREA = 1.566, YEAR = 1.984, BATHS = 0.3, BRICK = 1, BASEMENT = 0)
FORSALE
predict(SHESHED, FORSALE) * 1000000
1
212628.3
predict(MANCAVE, FORSALE) * 1000000
[,1]
[1,] 187509.7
Suppose you are a movie producer and you want to predict how much revenue a movie will generate. For this purpose, you collect data from a random sample of 1,767 movies from the movie database (www.themoviedb.org). For each movie, the data set includes the following variables:
Use R & build an artificial neural network that predicts the REVENUE generated from a movie. Include three hidden nodes in the artificial neural network. Use logistic (sigmoid) as the activation function.
load("/Users/timhulak/Desktop/Syracuse/FIN-654\ Financial\ Analytics/Week\ 10/10_deep.rdata")
head(HOLLYWOOD)
head(WALKOFFAME)
Use R & build an artificial neural network that predicts the REVENUE generated from a movie. Include three hidden nodes in the artificial neural network. Use logistic (sigmoid) as the activation function.
library(neuralnet)
set.seed(5)
LOWBUDGET <- neuralnet(REVENUE ~ ., data = HOLLYWOOD, hidden = 3, act.fct = "logistic")
plot(LOWBUDGET)
Build a Deep ANN
Now build another model that has two hidden layers with three & two hidden nodes, respectively. Use logistic (sigmoid) as the activation function.
set.seed(3)
BLOCKBUSTER <- neuralnet(REVENUE ~ ., data = HOLLYWOOD, hidden = c(3,2), act.fct = "logistic")
plot(BLOCKBUSTER)
If the neural network has multiple hidden layers, it has deep learning architecture.
Test both models using the data frame WALKOFFAME. Use each model to predict the revenue from the movies in WALKOFFAME.
tail(WALKOFFAME)
lowbudget <- predict(LOWBUDGET, WALKOFFAME)
tail(lowbudget)
[,1]
The Mummy 0.30263066
Airport 0.04136593
A View to a Kill 0.09014952
Urban Legend 0.05160713
A Street Cat Named Bob 0.03007842
Saw 0.03616818
cor(lowbudget, WALKOFFAME$REVENUE)
[,1]
[1,] 0.7270871
blockbuster <- predict(BLOCKBUSTER, WALKOFFAME)
tail(blockbuster)
[,1]
The Mummy 0.33028681
Airport 0.02421945
A View to a Kill 0.08896215
Urban Legend 0.03639198
A Street Cat Named Bob 0.04235157
Saw 0.02458765
cor(blockbuster, WALKOFFAME$REVENUE)
[,1]
[1,] 0.7036656
The deep neural network model did not perform better than the single layer neural network (the simpler model displayed superior performance). This may be due to over-fitting the model. This means that you are going beyond the general pattern in the data and modeling the random noise in the dataset.
GRISWOLD <- data.frame(BUDGET = 0.027, RUNTIME = 1.6, HORROR = 0, R.RATED = 0)
GRISWOLD
predict(LOWBUDGET,GRISWOLD) * 1000000000
[,1]
[1,] 72359835
predict(BLOCKBUSTER,GRISWOLD) * 1000000000
[,1]
[1,] 68462710
The simpler model predicted 72,359,835 which is closer to 71,319,526
# This movie generated a revenue of $411.6 million
BATMAN <- data.frame(BUDGET = 0.035, RUNTIME = 2.1, HORROR = 0, R.RATED = 0)
BATMAN
predict(LOWBUDGET,BATMAN) * 1000000000
[,1]
[1,] 101620712
predict(BLOCKBUSTER,BATMAN) * 1000000000
[,1]
[1,] 106808829
Neither model was even close.
You can improve model performance by including more predictors, such as ACTION = 0 or 1.